{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Read the wind-turbine CSV from the Azure ML workspace artifact datastore.\n",
        "# The resulting frame `df` is consumed by the training cell below.\n",
        "df = pd.read_csv(\n",
        "    \"azureml://subscriptions/5f7e7551-1c5d-42a7-9cdd-7582134d0f5f/resourcegroups/iiot-book2-resources/workspaces/iiot-book2-ml-workspace1/datastores/workspaceartifactstore/paths/UI/2024-08-14_093544_UTC/wind_turbine.csv\"\n",
        ")\n",
        "\n",
        "# Leave the frame as the last expression so the notebook renders it\n",
        "# (pandas abbreviates the display to the first and last rows).\n",
        "df"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 1,
          "data": {
            "text/plain": "       wind_speed_ms  power_generated_kw\n0               6.07           16.972552\n1               7.43           54.418928\n2               8.19           78.077916\n3               8.19           83.096364\n4               8.19           80.569083\n...              ...                 ...\n52555           7.27           52.851684\n52556           7.28           49.878267\n52557           7.22           44.911130\n52558           7.30           50.840444\n52559           7.36           52.424180\n\n[52560 rows x 2 columns]",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>wind_speed_ms</th>\n      <th>power_generated_kw</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>6.07</td>\n      <td>16.972552</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>7.43</td>\n      <td>54.418928</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>8.19</td>\n      <td>78.077916</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>8.19</td>\n      <td>83.096364</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>8.19</td>\n      <td>80.569083</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>52555</th>\n      <td>7.27</td>\n      <td>52.851684</td>\n    </tr>\n    <tr>\n      <th>52556</th>\n      <td>7.28</td>\n      <td>49.878267</td>\n    </tr>\n    <tr>\n      <th>52557</th>\n      <td>7.22</td>\n      <td>44.911130</td>\n    </tr>\n    <tr>\n      <th>52558</th>\n      <td>7.30</td>\n      <td>50.840444</td>\n    </tr>\n    <tr>\n      <th>52559</th>\n      <td>7.36</td>\n      <td>52.424180</td>\n    </tr>\n  </tbody>\n</table>\n<p>52560 rows × 2 columns</p>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 1,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1723728893280
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Train, evaluate and register a gradient-boosting regressor on the frame `df`\n",
        "# loaded in the previous cell. `df` must contain the label column\n",
        "# \"power_generated_kw\"; every other column is used as a feature.\n",
        "import os\n",
        "import argparse\n",
        "import shutil\n",
        "import pandas as pd\n",
        "import mlflow\n",
        "import mlflow.sklearn\n",
        "from sklearn.ensemble import GradientBoostingRegressor\n",
        "from sklearn.metrics import r2_score\n",
        "from sklearn.model_selection import train_test_split\n",
        "from datetime import datetime\n",
        "\n",
        "\n",
        "registered_model_name=\"wind_turbine\"\n",
        "n_estimators=100\n",
        "label_column = \"power_generated_kw\"\n",
        "# Fixed seed so the train/test split and the fitted model (and therefore the\n",
        "# logged R2 score) are reproducible across re-runs of this cell.\n",
        "random_state = 42\n",
        "\n",
        "\n",
        "# Start Logging (the `with` block ends the MLflow run automatically on exit)\n",
        "with mlflow.start_run():\n",
        "\n",
        "    # enable autologging\n",
        "    mlflow.sklearn.autolog()\n",
        "\n",
        "    ###################\n",
        "    #<prepare the data>\n",
        "    ###################\n",
        "    mlflow.log_metric(\"num_samples\", df.shape[0])\n",
        "    # every column except the label is a feature\n",
        "    mlflow.log_metric(\"num_features\", df.shape[1] - 1)\n",
        "\n",
        "    #Split train and test datasets\n",
        "    train_df, test_df = train_test_split(\n",
        "        df,\n",
        "        test_size=0.3,\n",
        "        random_state=random_state,\n",
        "    )\n",
        "    ####################\n",
        "    #</prepare the data>\n",
        "    ####################\n",
        "\n",
        "    ##################\n",
        "    #<train the model>\n",
        "    ##################\n",
        "    # Separate features and label as NumPy arrays. The label is passed as an\n",
        "    # ndarray rather than a pandas Series because autologging previously warned\n",
        "    # \"'Series' object has no attribute 'flatten'\" while logging the training\n",
        "    # dataset. The split frames themselves are left unmodified.\n",
        "    X_train = train_df.drop(columns=[label_column]).values\n",
        "    y_train = train_df[label_column].values\n",
        "\n",
        "    X_test = test_df.drop(columns=[label_column]).values\n",
        "    y_test = test_df[label_column].values\n",
        "\n",
        "    print(f\"Training with data of shape {X_train.shape}\")\n",
        "\n",
        "    reg = GradientBoostingRegressor(\n",
        "        n_estimators=n_estimators,\n",
        "        random_state=random_state,\n",
        "    )\n",
        "    reg.fit(X_train, y_train)\n",
        "\n",
        "    y_pred = reg.predict(X_test)\n",
        "\n",
        "    # Compute the held-out R2 once and reuse it for display and tracking\n",
        "    test_r2 = r2_score(y_test, y_pred)\n",
        "    print(test_r2)\n",
        "    mlflow.log_metric(\"r2_score_test\", test_r2)\n",
        "    ###################\n",
        "    #</train the model>\n",
        "    ###################\n",
        "\n",
        "    ##########################\n",
        "    #<save and register model>\n",
        "    ##########################\n",
        "    # Registering the model to the workspace\n",
        "    print(\"Registering the model via MLFlow\")\n",
        "    mlflow.sklearn.log_model(\n",
        "        sk_model=reg,\n",
        "        registered_model_name=registered_model_name,\n",
        "        artifact_path=registered_model_name,\n",
        "    )\n",
        "\n",
        "    # Saving the model to a file. save_model refuses to write into an existing,\n",
        "    # non-empty directory, so remove the output of a previous run first to keep\n",
        "    # this cell re-runnable.\n",
        "    model_dir = os.path.join(registered_model_name, \"trained_model\")\n",
        "    if os.path.isdir(model_dir):\n",
        "        shutil.rmtree(model_dir)\n",
        "    mlflow.sklearn.save_model(\n",
        "        sk_model=reg,\n",
        "        path=model_dir,\n",
        "    )\n",
        "    ###########################\n",
        "    #</save and register model>\n",
        "    ###########################\n"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": "2024/08/15 13:38:20 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n2024/08/15 13:38:23 WARNING mlflow.sklearn: Failed to log training dataset information to MLflow Tracking. Reason: 'Series' object has no attribute 'flatten'\n2024/08/15 13:38:29 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.\"\nRegistered model 'wind_turbine' already exists. Creating a new version of this model...\n2024/08/15 13:38:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: wind_turbine, version 6\nCreated version '6' of model 'wind_turbine'.\n"
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": "Training with data of shape (36792, 1)\n0.9407326876025165\nRegistering the model via MLFlow\n"
        }
      ],
      "execution_count": 3,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1723729122294
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Stop compute instance\n",
        "If you're not going to use it now, stop the compute instance:\n",
        "\n",
        "* In the studio, in the left navigation area, select Compute.\n",
        "* In the top tabs, select Compute instances.\n",
        "* Select the compute instance in the list.\n",
        "* On the top toolbar, select Stop."
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python310-sdkv2",
      "language": "python",
      "display_name": "Python 3.10 - SDK v2"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.14",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "microsoft": {
      "ms_spell_check": {
        "ms_spell_check_language": "en"
      },
      "host": {
        "AzureML": {
          "notebookHasBeenCompleted": true
        }
      }
    },
    "kernel_info": {
      "name": "python310-sdkv2"
    },
    "nteract": {
      "version": "nteract-front-end@1.0.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}